#setwd("~/Dropbox/Shrew_Bat_Diet_Trial/Scripts/Analyses")
library(phyloseq)
library(ggplot2)
library(dplyr)
library(vegan)
#library(decontam)
library(stringr)
library(knitr)
library(hilldiv)
library(kableExtra)
library(gridExtra)
library(reshape2)

1. Introduction

Prior to this script, sequences have been clustered together into MOTUs using sumaclust at similarity thresholds between 95% and 98%. Singletons have already been removed and MOTUs have already been assigned to taxonomy using blastn and NCBI database (see script XXX). The following batch of script were used to process/filter the MOTUs and clarify how many MOTUs are retained after filtering (i.e. removing low abundance samples and MOTUs and removing non=prey taxa such as vertebrate or parasitic taxa).

The code is shown for Gillet at 95% and Zeale at 95% clustering. The same code is repeated for thresholds between 96% - 98%. The full code is available on request, however it is just repeated chunks.

2. Gillet

2.1. Code (example using Gillet dataset after clustering sequences at 95%)

2.1.1. Prepare Samples

# Load Dataset
load("../gillet_dataset/sumaclust95/phyloseq_object_clust95_iden_taxa_assigned_no_singletons.RData")
# Change MOTU names to gMOTU_1, gMOTU_2 etc etc
taxa_names(gillet.phylo95) <- paste("gMOTU", seq(nrow(tax_table(gillet.phylo95))), sep="_")
# quickly remove very low read samples 
gillet.phylo <- prune_samples(sample_sums(gillet.phylo95) > 50, gillet.phylo95)
# examine what we're looking at
gillet.phylo
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2170 taxa and 50 samples ]
## sample_data() Sample Data:       [ 50 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2170 taxa by 10 taxonomic ranks ]
# Remove 'sample.' part of sample names that obitools annoyingly adds
chck <- sample_names(gillet.phylo)
chck <- str_remove(chck, "sample.")
sample_names(gillet.phylo) <- chck

2.1.2. Examine Blanks

Whats in the blanks?

n <- which(otu_table(gillet.phylo)[,"G_NEG"] > 0)
#otu_table(gillet.phylo)[n,"G_NEG"]
#tax_table(gillet.phylo)[n,3:7]

m <- which(otu_table(gillet.phylo)[,"G_Neg"] > 0)
#otu_table(gillet.phylo)[m,"G_Neg"]
#tax_table(gillet.phylo)[m,3:7]

l <- unique(c(n,m))
blnk.df <- as.data.frame(as.matrix(tax_table(gillet.phylo))[l,4:7])
blnk.df$total.reads <- taxa_sums(gillet.phylo)[l]
blnk.df$Neg1.reads <- otu_table(gillet.phylo)[l, "G_NEG"]
for (i in 1:nrow(blnk.df)) {blnk.df$Neg1.prop[i] <- (blnk.df$Neg1.reads[i] / blnk.df$total.reads[i]) * 100 }
blnk.df$Neg2.reads <- otu_table(gillet.phylo)[l, "G_Neg"]
for (i in 1:nrow(blnk.df)) {blnk.df$Neg2.prop[i] <- (blnk.df$Neg2.reads[i] / blnk.df$total.reads[i]) * 100 }
blnk.df$perc <- apply(blnk.df[,c("Neg1.prop","Neg2.prop")], 1, sum)
rownames(blnk.df) <- 1:nrow(blnk.df)

#kable(blnk.df, caption = "MOTUs identified in the blanks")

2.1.3. Table: MOTUs in Blanks

landscape(knitr::kable(blnk.df, caption = "MOTUs identified in the blanks") %>%
            kable_styling(bootstrap_options = "striped", 
                          full_width = F,
                          position = "left"))
MOTUs identified in the blanks
order family genus species total.reads Neg1.reads Neg1.prop Neg2.reads Neg2.prop perc
Chiroptera Rhinolophidae Family_Rhinolophidae Family_Rhinolophidae 3975502 3241 0.0815243 1862 0.0468369 0.1283611
Primates Hominidae Homo Homo_sapiens 955 11 1.1518325 11 1.1518325 2.3036649
Class_Mammalia Class_Mammalia Class_Mammalia Class_Mammalia 2 1 50.0000000 0 0.0000000 50.0000000
Eulipotyphla Soricidae Family_Soricidae Family_Soricidae 1941340 0 0.0000000 804 0.0414147 0.0414147
Eulipotyphla Soricidae Crocidura Crocidura_russula 426731 0 0.0000000 2180 0.5108605 0.5108605
Hymenoptera Cynipidae Neuroterus Neuroterus_quercusbaccarum 17794 0 0.0000000 5 0.0280994 0.0280994
Diptera Order_Diptera Order_Diptera Order_Diptera 6552 0 0.0000000 3 0.0457875 0.0457875
Diptera Agromyzidae Family_Agromyzidae Family_Agromyzidae 6202 0 0.0000000 2 0.0322477 0.0322477
Diptera Order_Diptera Order_Diptera Order_Diptera 2231 0 0.0000000 2 0.0896459 0.0896459
Eulipotyphla Soricidae Crocidura Genus_Crocidura 1530 0 0.0000000 1 0.0653595 0.0653595
Primates Hominidae Homo Homo_sapiens 820 0 0.0000000 35 4.2682927 4.2682927
Eulipotyphla Soricidae Family_Soricidae Family_Soricidae 1678 0 0.0000000 1 0.0595948 0.0595948
Araneae Linyphiidae Hypomma Hypomma_cornutum 563 0 0.0000000 1 0.1776199 0.1776199
Class_Mammalia Class_Mammalia Class_Mammalia Class_Mammalia 388 0 0.0000000 1 0.2577320 0.2577320
Eulipotyphla Order_Eulipotyphla Order_Eulipotyphla Order_Eulipotyphla 208 0 0.0000000 1 0.4807692 0.4807692
Diptera Scathophagidae Family_Scathophagidae Family_Scathophagidae 63 0 0.0000000 1 1.5873016 1.5873016
Hymenoptera Order_Hymenoptera Order_Hymenoptera Order_Hymenoptera 49 0 0.0000000 1 2.0408163 2.0408163
Eulipotyphla Soricidae Crocidura Genus_Crocidura 155 0 0.0000000 2 1.2903226 1.2903226
Chiroptera Order_Chiroptera Order_Chiroptera Order_Chiroptera 84 0 0.0000000 1 1.1904762 1.1904762
Class_Mammalia Class_Mammalia Class_Mammalia Class_Mammalia 86 0 0.0000000 1 1.1627907 1.1627907
Class_Insecta Class_Insecta Class_Insecta Class_Insecta 31 0 0.0000000 2 6.4516129 6.4516129
Class_Aves Class_Aves Class_Aves Class_Aves 2 0 0.0000000 1 50.0000000 50.0000000
Class_Aves Class_Aves Class_Aves Class_Aves 2 0 0.0000000 1 50.0000000 50.0000000
Class_Aves Class_Aves Class_Aves Class_Aves 5 0 0.0000000 1 20.0000000 20.0000000
Diptera Culicidae Family_Culicidae Family_Culicidae 4 0 0.0000000 4 100.0000000 100.0000000
Diptera Order_Diptera Order_Diptera Order_Diptera 2 0 0.0000000 1 50.0000000 50.0000000
Diptera Tipulidae Tipula Genus_Tipula 3 0 0.0000000 1 33.3333333 33.3333333

Remove taxa of which the blanks hold over 2% of the total reads for that MOTU

tab.nam <- blnk.df[,"perc"] > 2 # 2 is for 2%
tab.df <- blnk.df[tab.nam,]

removeTaxa <- rownames(tab.df) # Lists the MOTUs to remove 
phy.obj <- subset_taxa(gillet.phylo, !(taxa_names(gillet.phylo) %in% removeTaxa))
# How many MOTUs left?
phy.obj
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2170 taxa and 50 samples ]
## sample_data() Sample Data:       [ 50 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2170 taxa by 10 taxonomic ranks ]

2.1.4. Level of Host Amplification

# Spare palette from https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# Palette is from study to create most contrasting colours
pal.o = c("#f0a3ff", # Araneae  
          "#0075dc", # Arch *  
          "#993f00", # Coleoptera
          "#4c005c", # Diptera *
          "#191919", # Ento *
          "#005c31", # Geo *
          "#2bce48", # Glom
          "#ffcc99", # Hap *
          "#808080", # Hemip
          "#94ffb5", # Hymen
          "#8f7c00", # Isopoda
          "#9dcc00", # Julida
          "#c20088", # Lepidop
          "blue", # Lithobiomorpha *
          "#ffa405", # Mesostig
          "#ffa8bb", # Neuropter *
          "#426600", # Oppiliones *
          "#ff0010", # Orbitida *
          "#5ef1f2",# Orthoptera *
          "#00998f", # Poly *
          "#e0ff66", # psocoptera
          "indianred",     # Sarc 
          "#003380",    # stylo
          "green", # trom *
          "khaki4",
          "darkred",
          "coral4",
          "violetred2",
          "#0075dc",
          "#993f00") 


samples.phylo <- gillet.phylo

samples.phylo <- tax_glom(gillet.phylo, taxrank = "order")
n <- grep("Chiroptera", tax_table(samples.phylo)[,"order"])
m <- grep("Eulipotyphla", tax_table(samples.phylo)[,"order"])
tax_table(samples.phylo)[-c(n, m), 1:4] <- "Other"

# What are the 3 unique taxa orders?
unique(tax_table(samples.phylo)[,"order"])
## Taxonomy Table:     [3 taxa by 1 taxonomic ranks]:
##         order         
## gMOTU_1 "Chiroptera"  
## gMOTU_2 "Eulipotyphla"
## gMOTU_5 "Other"
# Agglomerate MOTUs to order level
mm.oth <- tax_glom(samples.phylo, taxrank = "order")
# Note that phyloseq has 3 taxa (Chiroptera, Eulipotyphla, Other)
mm.oth
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3 taxa and 50 samples ]
## sample_data() Sample Data:       [ 50 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 3 taxa by 10 taxonomic ranks ]
# transform read counts to relative abundance
mm.oth.ra = transform_sample_counts(mm.oth, function(x) 100 * x/sum(x))

ra.samples.bar <- phyloseq::plot_bar(mm.oth.ra) # extracts information needed for barplots
ra.samples.bar.data <- ra.samples.bar$data
p1.95 <- ggplot(ra.samples.bar.data, aes(x= Sample, y=Abundance, fill = order)) + 
  geom_bar(stat="identity", color="black") +
  scale_fill_manual(values = pal.o) +
  facet_wrap(~ mammal, scale = "free_x") +
  theme_classic() +
  theme(legend.position = "right") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.1.5. Figure: Host Amplification

p1.95
Composition of prey reads compared to host - clustered at 95%

Composition of prey reads compared to host - clustered at 95%

Check range of vertebrate amplification in samples

samples.phylo <- subset_samples(phy.obj, mammal != "BLANK")

n <- grep("Chordata", tax_table(samples.phylo)[,"phylum"])
tax_table(samples.phylo)[-n, 1:4] <- "Other"
#unique(tax_table(samples.phylo)[,"phylum"])

mm.oth <- tax_glom(samples.phylo, taxrank = "phylum")
#mm.oth
mm.oth.ra = transform_sample_counts(mm.oth, function(x) 100 * x/sum(x))

# Check Taxa
taxa_names(mm.oth.ra) <- c("Vertebrate", "Other")
tax_table(mm.oth.ra)[,1:2]
## Taxonomy Table:     [2 taxa by 2 taxonomic ranks]:
##            kingdom   phylum    
## Vertebrate "Metazoa" "Chordata"
## Other      "Other"   "Other"
# Proportion of host reads in each sample
otu_table(mm.oth.ra)
## OTU Table:          [2 taxa and 48 samples]
##                      taxa are rows
##                 G_1     G_10      G_11      G_12      G_13      G_14
## Vertebrate 89.06104 97.44169 98.685607 98.875154 97.660007 92.095764
## Other      10.93896  2.55831  1.314393  1.124846  2.339993  7.904236
##                 G_15     G_16     G_17      G_18       G_19       G_2
## Vertebrate 97.956883 86.62167 78.16822 97.878097 99.5345188 97.801715
## Other       2.043117 13.37833 21.83178  2.121903  0.4654812  2.198285
##                 G_20      G_21     G_22     G_23     G_24        G_3
## Vertebrate 97.866227 91.206891 89.45764 84.74596 64.51326 99.6463573
## Other       2.133773  8.793109 10.54236 15.25404 35.48674  0.3536427
##                   G_4      G_5       G_6       G_7       G_8       G_9
## Vertebrate 99.4643312 90.73662 94.264268 94.297427 95.609202 95.563988
## Other       0.5356688  9.26338  5.735732  5.702573  4.390798  4.436012
##                G_E26      G_E27    G_E29    G_E31    G_E32     G_E33
## Vertebrate 96.158934 99.6653127 11.93543 92.15071 26.50477 90.026674
## Other       3.841066  0.3346873 88.06457  7.84929 73.49523  9.973326
##                 G_E47   G_FR14    G_FR15    G_FR16    G_FR17    G_FR20
## Vertebrate 99.0556457  4.38972  3.199108  5.803229 90.425228 91.468491
## Other       0.9443543 95.61028 96.800892 94.196771  9.574772  8.531509
##              G_FR21     G_S18    G_S19    G_S21     G_S33      G_S34
## Vertebrate 89.20737 95.417696 22.61281 40.02194 98.843105 99.8711241
## Other      10.79263  4.582304 77.38719 59.97806  1.156895  0.1288759
##                G_S48    G_S50      G_S51     G_W12    G_W13    G_W14
## Vertebrate 95.932179 74.20471  0.8261888 95.950804 79.85832 86.18168
## Other       4.067821 25.79529 99.1738112  4.049196 20.14168 13.81832
# Range of Vertebrate amplification across all samples (%)
range(otu_table(mm.oth.ra)[1,])
## [1]  0.8261888 99.8711241
# Range of Vertebrate amplification across GWTS (%)
gwts.prop <- subset_samples(mm.oth.ra, mammal == "GWTS")
range(otu_table(gwts.prop)[1,])
## [1] 89.20737 99.87112
# Range of Vertebrate amplification across Pygmies (%)
pyg.prop <- subset_samples(mm.oth.ra, mammal == "Pygmy")
range(otu_table(pyg.prop)[1,])
## [1]  0.8261888 95.9508037
# Range of Vertebrate amplification across Bats (%)
bat.prop <- subset_samples(mm.oth.ra, mammal == "Bat")
range(otu_table(bat.prop)[1,])
## [1] 64.51326 99.64636

2.1.6. Remove non-prey taxa

# Remove negative controls
samples.phylo <- subset_samples(phy.obj, mammal != "BLANK")
# Remove non-prey taxa
diet.prey <- subset_taxa(samples.phylo, !(class %in% c("Mammalia", 
                                                       "none",
                                                       "Actinopteri",
                                                       "Bdelloidea",
                                                       "Udeonychophora", # velvet worms
                                                       "Merostomata", # horse shoe crabs
                                                       "Gammaproteobacteria", # bacteria
                                                       "Magnoliopsida", # plants
                                                       "Monogononta", # rotifers
                                                       "Dothideomycetes", # fungi
                                                       "Trebouxiophyceae", # green algae
                                                       "Chondrichthyes", # Cartilaginous fish
                                                       "Mucoromycetes", # fungi
                                                       "Phylum_Endomyxa", # micro things
                                                       "Eutardigrada", # tartigrades!!
                                                       "Elardia", # Amoebas
                                                       "Cephalopoda", # Cephalopods
                                                       "Amphibia", # Amphibians
                                                       "Aves", # Birds
                                                       "Chromadorea", # roundworms
                                                       "Hexanauplia",  # parasitic crustaceans
                                                       "Kingdom_Metazoa",
                                                       "Kingdom_",
                                                       "Phylum_Discosea", # amoebas
                                                       "Branchiopoda", # marine crustaceans
                                                       "Phylum_Nematoda")))

2.1.7. Filter low read Samples and MOTUs

# remove samples with less than 1000 reads
sampl.filt <- prune_samples(sample_sums(diet.prey) > 1000, diet.prey)
# Extract OTU table
otu.tab <- as.data.frame(otu_table(sampl.filt))
# Remove MOTUs from samples that have < 0.01% total reads of that sample
new.otu.tab <- hilldiv::copy_filt(otu.tab, 0.0001)
# Assign cleaned matrix to phyloseq object
new.otu.tab <- as.matrix(new.otu.tab)
otu_table(sampl.filt) <- otu_table(new.otu.tab, taxa_are_rows = TRUE)
# Check amount of remaining MOTUs
sampl.filt
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1075 taxa and 44 samples ]
## sample_data() Sample Data:       [ 44 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 1075 taxa by 10 taxonomic ranks ]
# Remove MOTUs that are represented by < 5 reads in total
final.diet <- prune_taxa(taxa_sums(sampl.filt) > 4, sampl.filt)
# The final dataset is...
final.diet
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 540 taxa and 44 samples ]
## sample_data() Sample Data:       [ 44 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 540 taxa by 10 taxonomic ranks ]

2.1.8. Diet composition

order.glom <- tax_glom(final.diet, taxrank= "order")
ntx <- as.numeric(ntaxa(order.glom))
top25ord <- prune_taxa(names(sort(taxa_sums(order.glom),TRUE)[1:19]), order.glom)
othr.ord <- prune_taxa(names(sort(taxa_sums(order.glom),TRUE)[20:ntx]), order.glom)

n <- taxa_names(othr.ord)
tax_table(order.glom)[n, 1:4] <- "Other"
n <- grep("Class_", tax_table(order.glom)[,"order"])
tax_table(order.glom)[n, 1:4] <- "Other"
top25ord <- tax_glom(order.glom, taxrank= "order")

ra.samples = transform_sample_counts(top25ord, function(x) 100 * x/sum(x))
ra.samples.bar <- phyloseq::plot_bar(ra.samples) # extracts information needed for barplots
ra.samples.bar.data <- ra.samples.bar$data
p2.95 <- ggplot(ra.samples.bar.data, aes(x= Sample, y=Abundance, fill = order)) +
  geom_bar(stat="identity", color="black") +
  scale_fill_manual(values = pal.o) +
  facet_wrap(~ mammal, scale = "free_x") +
  theme_classic() + # appearance composition
  theme(strip.text.x = element_text(size = 15, face = "bold")) + # facet text
#  scale_x_discrete(labels=c("Bat_Gillet" = "Bat\nN=22", "GWTS_Gillet" = "GWTS\nN=7", "Pygmy_Gillet" = "Pygmy\nN=15",
#                            "Bat_Zeale" = "Bat\nN=23", "GWTS_Zeale" = "GWTS\nN=4", "Pygmy_Zeale" = "Pygmy\nN=11")) +
    theme(legend.position = "right",
        legend.title = element_blank(),
        legend.text = element_text(size = 10),
        legend.key.size = unit(0.8, "cm")) + 
  ggtitle("Composition of diet - clustered at 95%") +
  labs(y = "Relative Abundance") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 20, 
                                    face = "bold")) +
  theme(axis.text.x = element_text(size = 12 , 
                                   face = "bold",
                                   angle = 45, 
                                   hjust = 1),
        axis.text.y = element_text(size = 12, 
                                   face = "bold"))

2.1.9. Figure: Diet Composition

p2.95
Composition of prey reads compared to host - clustered at 95%

Composition of prey reads compared to host - clustered at 95%

# Range of sample depth of samples
range(sample_sums(final.diet))
## [1]   1120 318945
# Range of read depth of taxa
range(taxa_sums(final.diet))
## [1]      5 172198
# Average read depth per sample
mean(sample_sums(final.diet))
## [1] 37982.2
# hill_div packages assessment taxa coverage per sample, according to a shannon diversity equivilent
depth_cov(new.otu.tab, 
          qvalue = 1)
##         Depth Observed Estimated Coverage
## G_1     35501     9.93      9.94    99.89
## G_10     5049    17.14     17.24    99.42
## G_11     2413    14.42     14.56    99.00
## G_12     1384    13.94     14.21    98.14
## G_13     1724     8.09      8.18    98.93
## G_14     7499     9.06      9.10    99.57
## G_15     3594    14.39     14.61    98.52
## G_16    15633    21.64     21.70    99.73
## G_17    23338     1.62      1.62    99.96
## G_18     1496     3.23      3.25    99.54
## G_2      7363     4.96      4.97    99.64
## G_20     2049    10.10     10.17    99.31
## G_21    14261    12.25     12.28    99.76
## G_22     7766     7.89      7.91    99.68
## G_23    25038     5.54      5.55    99.91
## G_24    35371     4.17      4.18    99.93
## G_4      1219     2.76      2.78    99.35
## G_5     18790     1.89      1.89    99.95
## G_6     11034     3.92      3.93    99.88
## G_7     10144     1.52      1.52    99.85
## G_8      9687    12.37     12.42    99.63
## G_9      7021    14.87     14.95    99.50
## G_E26    8841     4.31      4.33    99.70
## G_E29   98018     7.17      7.18    99.99
## G_E31    3088     2.69      2.71    99.43
## G_E32   65729     4.01      4.02    99.99
## G_E33   20263     4.32      4.32    99.95
## G_E47    2496     1.12      1.12    99.62
## G_FR14 318945     4.19      4.19   100.00
## G_FR15 279150     3.23      3.23    99.99
## G_FR16 181397     4.94      4.94    99.99
## G_FR17   1713     2.86      2.87    99.85
## G_FR20  19879     1.47      1.47    99.93
## G_FR21  42139     7.80      7.81    99.97
## G_S18    4378     4.33      4.33    99.82
## G_S19   72526     4.64      4.64    99.98
## G_S21   34698     3.63      3.63    99.98
## G_S33    2475     4.88      4.91    99.47
## G_S48   10581     1.54      1.54    99.95
## G_S50   13493    11.25     11.26    99.93
## G_S51  225301     2.50      2.50   100.00
## G_W12    1123     3.82      3.86    99.15
## G_W13    6602     3.42      3.43    99.70
## G_W14   11386     6.75      6.75    99.96

2.1.10. Rarefaction

# Seperate phyloseq object per mammal species
Bat_G <- prune_samples(sample_data(final.diet)$mammal == "Bat", final.diet)
df1 <- as.data.frame(t(as.matrix(otu_table(Bat_G))))
gwts_G <- prune_samples(sample_data(final.diet)$mammal == "GWTS", final.diet)
df2 <- as.data.frame(t(as.matrix(otu_table(gwts_G))))
pyg_G <- prune_samples(sample_data(final.diet)$mammal == "Pygmy", final.diet)
df3 <- as.data.frame(t(as.matrix(otu_table(pyg_G))))

# Grab random subset of 10 individuals from each mammal species to test
set.seed(57)
# Bat sample rarefaction
x <- sample(nrow(df1), 10)
r1 <- rarecurve(df1[x,])

# GWTS sample rarefaction
#y <- sample(nrow(df2), 10)
r2 <- rarecurve(df2[,])

# Pygmy sample rarefaction
y <- sample(nrow(df3), 10)
r3 <- rarecurve(df3[y,])

The following is an alternative way to show the rarefaction results

col <- c("black", "darkred", "forestgreen", "hotpink", "blue")
lty <- c("solid", "dashed", "dotdash")
lwd <- c(1, 2)
pars <- expand.grid(col = col, lty = lty, lwd = lwd, 
                    stringsAsFactors = FALSE)
head(pars)

out <- r1
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)

plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - Bat - Rarefaction",
     xlab = "Sample Size",
     ylab = "Species", type = "n", ylim = c(0,150)) 
#abline(v = raremax)
for (i in seq_along(out)) {
  N <- attr(out[[i]], "Subsample")
  with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}

out <- r2
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)

plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - GWTS - Rarefaction",
     xlab = "Sample Size",
     ylab = "Species", type = "n", ylim = c(0,150)) 
#abline(v = raremax)
for (i in seq_along(out)) {
  N <- attr(out[[i]], "Subsample")
  with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}

out <- r3
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)

plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - Pygmy - Rarefaction",
     xlab = "Sample Size",
     ylab = "Species", type = "n", ylim = c(0,150)) 
#abline(v = raremax)
for (i in seq_along(out)) {
  N <- attr(out[[i]], "Subsample")
  with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}

2.1.11. Number of identified taxa

The following is a chunky code to count the MOTUs that were correctly assigned to taxonomy at various levels, accoring to this studies criteria set and specified in the main text.

It doe not count when the taxonomy contains “_sp." etc in the name

final.diet.g <- final.diet

# Extract taxonomy table
df1 <- as.data.frame(as.matrix(tax_table(final.diet.g)))
df <- df1

df$pident <- as.character(df$pident)
df$pident <- as.numeric(df$pident)
###################
## Species
n <- which(df[,"pident"] > 97.9999)
gn <- grep("Genus_", df[n,"species"])
fm <- grep("Family_", df[n,"species"])
or <- grep("Order_", df[n,"species"])
cl <- grep("Class_", df[n,"species"])
ph <- grep("Phylum_", df[n,"species"])
kg <- grep("Kindgom_", df[n,"species"])
nn <- grep("none", df[n,"species"])
s.p <- grep("_sp._", df[n,"species"])
n.r <- grep("_nr._", df[n,"species"])
c.f <- grep("_cf._", df[n,"species"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f))
sp.y <- length(df[n,"species"]) - x
chck <- df[n[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f)],"species"]
un.sp.y <- length(unique(chck))

###################
## Genus
df <- df[-n,]
n <- which(df[,"pident"] > 94.9999)
gn <- grep("Genus_", df[n,"genus"])
fm <- grep("Family_", df[n,"genus"])
or <- grep("Order_", df[n,"genus"])
cl <- grep("Class_", df[n,"genus"])
ph <- grep("Phylum_", df[n,"genus"])
kg <- grep("Kindgom_", df[n,"genus"])
nn <- grep("none", df[n,"genus"])
s.p <- grep("_sp._", df[n,"genus"])
n.r <- grep("_nr._", df[n,"genus"])
c.f <- grep("_cf._", df[n,"genus"])
g.g <- grep("_gen._", df[n,"genus"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
gn.y <- length(df[n,"genus"]) - x

gn <- grep("Genus_", df1[,"genus"])
fm <- grep("Family_", df1[,"genus"])
or <- grep("Order_", df1[,"genus"])
cl <- grep("Class_", df1[,"genus"])
ph <- grep("Phylum_", df1[,"genus"])
kg <- grep("Kindgom_", df1[,"genus"])
nn <- grep("none", df1[,"genus"])
s.p <- grep("_sp._", df1[,"genus"])
n.r <- grep("_nr._", df1[,"genus"])
c.f <- grep("_cf._", df1[,"genus"])
g.g <- grep("_gen._", df1[,"genus"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"genus"]
un.gn.y <- length(unique(chck))

###################
## Family
df <- df[-n,]
n <- which(df[,"pident"] > 92.999)
gn <- grep("Genus_", df[n,"family"])
fm <- grep("Family_", df[n,"family"])
or <- grep("Order_", df[n,"family"])
cl <- grep("Class_", df[n,"family"])
ph <- grep("Phylum_", df[n,"family"])
kg <- grep("Kindgom_", df[n,"family"])
nn <- grep("none", df[n,"family"])
s.p <- grep("_sp._", df[n,"family"])
n.r <- grep("_nr._", df[n,"family"])
c.f <- grep("_cf._", df[n,"family"])
g.g <- grep("_gen._", df[n,"family"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
fm.y <- length(df[n,"family"]) - x

gn <- grep("Genus_", df1[,"family"])
fm <- grep("Family_", df1[,"family"])
or <- grep("Order_", df1[,"family"])
cl <- grep("Class_", df1[,"family"])
ph <- grep("Phylum_", df1[,"family"])
kg <- grep("Kindgom_", df1[,"family"])
nn <- grep("none", df1[,"family"])
s.p <- grep("_sp._", df1[,"family"])
n.r <- grep("_nr._", df1[,"family"])
c.f <- grep("_cf._", df1[,"family"])
g.g <- grep("_gen._", df1[,"family"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"family"]
un.fm.y <- length(unique(chck))

###################
## Order
df <- df[-n,]
n <- which(df[,"pident"] > 89.9999)
gn <- grep("Genus_", df[n,"order"])
fm <- grep("Family_", df[n,"order"])
or <- grep("Order_", df[n,"order"])
cl <- grep("Class_", df[n,"order"])
ph <- grep("Phylum_", df[n,"order"])
kg <- grep("Kindgom_", df[n,"order"])
nn <- grep("none", df[n,"order"])
s.p <- grep("_sp._", df[n,"order"])
n.r <- grep("_nr._", df[n,"order"])
c.f <- grep("_cf._", df[n,"order"])
g.g <- grep("_gen._", df[n,"order"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
or.y <- length(df[n,"order"]) - x

gn <- grep("Genus_", df1[,"order"])
fm <- grep("Family_", df1[,"order"])
or <- grep("Order_", df1[,"order"])
cl <- grep("Class_", df1[,"order"])
ph <- grep("Phylum_", df1[,"order"])
kg <- grep("Kindgom_", df1[,"order"])
nn <- grep("none", df1[,"order"])
s.p <- grep("_sp._", df1[,"order"])
n.r <- grep("_nr._", df1[,"order"])
c.f <- grep("_cf._", df1[,"order"])
g.g <- grep("_gen._", df1[,"order"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"order"]
un.or.y <- length(unique(chck))

df <- df1

#un.gn.y <- length(unique(df1[,"genus"]))
#un.fm.y <- length(unique(df1[,"family"]))
#un.or.y <- length(unique(df1[,"order"])) 

# Create a table to save the results

tabx <- data.frame(primer = NA, 
                   order = NA, unique.orders = NA, 
                   family = NA, unique.families = NA,
                   genus = NA, unique.genus = NA,
                   species = NA, unique.species = NA,
                   total.taxa = NA,
                   total.reads = NA)

# Save results into the table/dataframe

tabx[1,] <- c("Gillet95", or.y, un.or.y, fm.y, un.fm.y, 
              gn.y, un.gn.y, sp.y, un.sp.y, 
              ntaxa(final.diet.g), sum(sample_sums(final.diet.g)))

Now that results have been saved in the dataframe, we repeat with each dataset and save the results in “tabx” dataframe


2.2. Rarefaction Plots from clustering sequences at 96% - 98%

2.2.1. 96% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r4 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r5 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r6 <- rarecurve(df3[y,])

2.2.2. 97% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r7 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r8 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r9 <- rarecurve(df3[y,])

2.2.3. 98% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r10 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r11 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r12 <- rarecurve(df3[y,])

2.3. Composition of diet - Comparison

gridExtra::grid.arrange(p2.95,
                        p2.96,
                        p2.97,
                        p2.98,
                        nrow = 2)
Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%

Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%

2.4. Taxonomic Assignment at different cluster thresholds

kable(tabx, caption = "Number of MOTUs assigned to taxonomy of different levels") %>%
  kable_styling(bootstrap_options = "striped",
                full_width = F,
                position = "left")
Number of MOTUs assigned to taxonomy of different levels
primer order unique.orders family unique.families genus unique.genus species unique.species total.taxa total.reads
Gillet95 72 24 82 120 72 207 217 211 540 1671217
Gillet96 71 25 79 123 116 216 233 220 606 1668404
Gillet97 82 25 85 123 180 218 271 228 736 1663063
Gillet98 95 25 107 125 228 223 371 232 945 1652436


3. Zeale

3.1. Details

3.1.1. Number of Samples and MOTUs

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2222 taxa and 51 samples ]
## sample_data() Sample Data:       [ 51 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2222 taxa by 10 taxonomic ranks ]

3.1.2. Table: MOTUs in Blanks

MOTUs identified in the blanks
order family genus species total.reads Neg1.reads Neg1.prop Neg2.reads Neg2.prop perc
Diptera Tipulidae Tipula Genus_Tipula 111029 170 0.1531132 0 0.0000000 0.1531132
Diptera Tipulidae Tipula Genus_Tipula 10 10 100.0000000 0 0.0000000 100.0000000
Diptera Tipulidae Family_Tipulidae Family_Tipulidae 11842 0 0.0000000 1 0.0084445 0.0084445

3.1.3. Figure: Proportion of Host Reads in Samples

Composition of prey reads compared to host - clustered at 95%

Composition of prey reads compared to host - clustered at 95%


3.1.4. Filtered Dataset

# remove samples with less than 1000 reads
sampl.filt <- prune_samples(sample_sums(diet.prey) > 1000, diet.prey)
# Extract OTU table
otu.tab <- as.data.frame(otu_table(sampl.filt))
# Remove MOTUs from samples that have < 0.01% total reads of that sample
new.otu.tab <- hilldiv::copy_filt(otu.tab, 0.0001)
# Assign cleaned matrix to phyloseq object
new.otu.tab <- as.matrix(new.otu.tab)
otu_table(sampl.filt) <- otu_table(new.otu.tab, taxa_are_rows = TRUE)
# Check amount of remaining MOTUs
sampl.filt
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2115 taxa and 38 samples ]
## sample_data() Sample Data:       [ 38 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2115 taxa by 10 taxonomic ranks ]
# Remove MOTUs that are represented by < 5 reads in total
final.diet <- prune_taxa(taxa_sums(sampl.filt) > 4, sampl.filt)
# The final dataset is...
final.diet
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 417 taxa and 38 samples ]
## sample_data() Sample Data:       [ 38 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 417 taxa by 10 taxonomic ranks ]

3.1.5. Read Depth of Samples

# Range of sample depth of samples
range(sample_sums(final.diet))
## [1]   2953 620979
# Range of read depth of taxa
range(taxa_sums(final.diet))
## [1]      5 526148
# Average read depth per sample
mean(sample_sums(final.diet))
## [1] 160684.3
# hill_div packages assessment taxa coverage per sample, according to a shannon diversity equivilent
depth_cov(new.otu.tab, 
          qvalue = 1)
##         Depth Observed Estimated Coverage
## Z_10   206188     6.06      6.06   100.00
## Z_11   252639     4.49      4.49    99.99
## Z_12   200456     3.49      3.50    99.99
## Z_13   241716     3.04      3.04    99.99
## Z_14    85475     6.01      6.01    99.99
## Z_15   246599     7.41      7.41    99.99
## Z_16     2978     7.43      7.52    98.76
## Z_17   235886     5.10      5.10   100.00
## Z_18     4607     2.14      2.14    99.80
## Z_19   345250     4.22      4.22   100.00
## Z_2     11971     4.04      4.04    99.95
## Z_20     7162     3.53      3.54    99.54
## Z_21   213317     9.97      9.98    99.99
## Z_22   264815     1.53      1.53   100.00
## Z_23    29968     7.92      7.92    99.95
## Z_24   310608     5.62      5.62    99.99
## Z_3     21562     1.94      1.94    99.98
## Z_4     13201     4.85      4.85    99.97
## Z_5    204439     4.19      4.19    99.99
## Z_6    140527     6.79      6.79    99.99
## Z_7    212066     6.73      6.73    99.99
## Z_8      4466     7.92      8.00    98.96
## Z_9     96952     1.80      1.80   100.00
## Z_E26   83067     2.19      2.19   100.00
## Z_E29  225498     1.22      1.22   100.00
## Z_E31   16701     1.10      1.10    99.97
## Z_E32  278483     2.88      2.88   100.00
## Z_E33   57603     1.11      1.11    99.98
## Z_FR14 203171     2.21      2.21   100.00
## Z_FR15 239751     3.49      3.49    99.99
## Z_FR16 620979     1.93      1.93   100.00
## Z_FR20  10716     2.11      2.11    99.92
## Z_FR21 166760     1.98      1.98   100.00
## Z_S19   35303     2.29      2.29    99.99
## Z_S21  433836     1.31      1.31   100.00
## Z_S50   97089     1.75      1.75   100.00
## Z_S51  205273     3.32      3.32   100.00
## Z_W13   79030     1.03      1.03   100.00


3.2. Rarefaction


3.2.1. 95% Clustering

# Grab random subset of 10 individuals from each mammal species to test
set.seed(57)
# Bat sample rarefaction
x <- sample(nrow(df1), 10)
r1 <- rarecurve(df1[x,])

# GWTS sample rarefaction
#y <- sample(nrow(df2), 10)
r2 <- rarecurve(df2[,])

# Pygmy sample rarefaction
y <- sample(nrow(df3), 10)
r3 <- rarecurve(df3[y,])


3.2.2. 96% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r4 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r5 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r6 <- rarecurve(df3[y,])


3.2.3. 97% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r7 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r8 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r9 <- rarecurve(df3[y,])


3.2.3. 98% Clustering

set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r10 <- rarecurve(df1[x,])

# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r11 <- rarecurve(df2[,])

# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r12 <- rarecurve(df3[y,])


3.3. Composition of diet - Comparison

gridExtra::grid.arrange(p2.95z,
                        p2.96z,
                        p2.97z,
                        p2.98z,
                        nrow = 2)
Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%

Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%


3.4. Taxonomic Assignment at different cluster thresholds

kable(tabx, caption = "Number of MOTUs assigned to taxonomy of different levels") %>%
  kable_styling(bootstrap_options = "striped",
                full_width = F,
                position = "left")
Number of MOTUs assigned to taxonomy of different levels
primer order unique.orders family unique.families genus unique.genus species unique.species total.taxa total.reads
Gillet95 72 24 82 120 72 207 217 211 540 1671217
Gillet96 71 25 79 123 116 216 233 220 606 1668404
Gillet97 82 25 85 123 180 218 271 228 736 1663063
Gillet98 95 25 107 125 228 223 371 232 945 1652436
Zeale95 36 14 95 85 113 162 130 127 417 6106005
Zeale96 39 14 90 82 173 172 142 133 499 6104836
Zeale97 44 13 125 85 279 189 217 146 750 6087051
Zeale98 50 13 125 84 325 195 297 158 927 6058860


4. Compare Tax Assignment: Gillet vs Zeale


4.1. Number of MOTUs taxonomically assigned

tabx$primer <- c(rep("Gillet", 4), rep("Zeale", 4))
df <- melt(data = tabx, id.vars = "primer",
           measure.vars = c("order", 
                            "family", 
                            "genus", 
                            "species"))
df$clust <- rep(c("95%", "96%", "97%", "98%"), 8)
df$variable <- paste(df$variable, df$clust, sep = " ")

df$value <- as.numeric(df$value)
df$primer <- factor(df$primer, levels = c("Zeale", "Gillet"))
df$variable <- factor(df$variable, levels = c("order 95%",
                            "order 96%",
                            "order 97%",
                            "order 98%",
                            "family 95%",
                            "family 96%",
                            "family 97%",
                            "family 98%",
                            "genus 95%",
                            "genus 96%",
                            "genus 97%",
                            "genus 98%",
                            "species 95%",
                            "species 96%",
                            "species 97%",
                            "species 98%"))

p <- ggplot(df, aes(x = variable, y=value)) +
  geom_bar(aes(fill = primer), 
           stat = "identity", 
           color = "black", 
           position = position_dodge()) +
  theme_classic() +
  scale_fill_manual(values = c("darkblue", "pink")) +
  scale_x_discrete(labels=c("order 95%" = "Order - 95%",
                            "order 96%" = "Order - 96%",
                            "order 97%" = "Order - 97%",
                            "order 98%" = "Order - 98%",
                            "family 95%" = "Family - 95%",
                            "family 96%" = "Family - 96%",
                            "family 97%" = "Family - 97%",
                            "family 98%" = "Family - 98%",
                            "genus 95%" = "Genus - 95%",
                            "genus 96%" = "Genus - 96%",
                            "genus 97%" = "Genus - 97%",
                            "genus 98%" = "Genus - 98%",
                            "species 95%" = "Species - 95%",
                            "species 96%" = "Species - 96%",
                            "species 97%" = "Species - 97%",
                            "species 98%" = "Species - 98%")) +
  theme(axis.text.x = element_text(size = 12, angle = 45, hjust = 1,
                                   face = "bold"),
        axis.text.y = element_text(size = 12, face = "bold")) +
  labs(y = "Number Identified") +
  theme(legend.position = c(0.15,0.8)) +
  theme(legend.title = element_blank(),
        legend.text = element_text(size = 20)) +
  #theme(legend.position = "bottom") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 15, face = "bold"))

p


4.2. Proportion of MOTUs taxonomically assigned

tabprop <- tabx
tabprop[2:11] <- sapply(tabprop[2:11],as.numeric)
for (i in 1:8){
  tabprop[i,"order"] <- (tabprop[i,"order"]/tabprop[i, "total.taxa"]) * 100
  tabprop[i,"family"] <- (tabprop[i,"family"]/tabprop[i, "total.taxa"]) * 100
  tabprop[i,"genus"] <- (tabprop[i,"genus"]/tabprop[i, "total.taxa"]) * 100
  tabprop[i,"species"] <- (tabprop[i,"species"]/tabprop[i, "total.taxa"]) * 100
  }

df <- melt(data = tabprop, id.vars = "primer",
           measure.vars = c("order", 
                            "family", 
                            "genus", 
                            "species"))
df$clust <- rep(c("95%", "96%", "97%", "98%"), 8)
df$variable <- paste(df$variable, df$clust, sep = " ")

df$value <- as.numeric(df$value)
df$primer <- factor(df$primer, levels = c("Zeale", "Gillet"))
df$variable <- factor(df$variable, levels = c("order 95%",
                                              "order 96%",
                                              "order 97%",
                                              "order 98%",
                                              "family 95%",
                                              "family 96%",
                                              "family 97%",
                                              "family 98%",
                                              "genus 95%",
                                              "genus 96%",
                                              "genus 97%",
                                              "genus 98%",
                                              "species 95%",
                                              "species 96%",
                                              "species 97%",
                                              "species 98%"))

p2 <- ggplot(df, aes(x = variable, y=value)) +
  geom_bar(aes(fill = primer), 
           stat = "identity", 
           color = "black", 
           position = position_dodge()) +
  theme_classic() +
  scale_fill_manual(values = c("darkblue", "pink")) +
  scale_x_discrete(labels=c("order 95%" = "Order - 95%",
                            "order 96%" = "Order - 96%",
                            "order 97%" = "Order - 97%",
                            "order 98%" = "Order - 98%",
                            "family 95%" = "Family - 95%",
                            "family 96%" = "Family - 96%",
                            "family 97%" = "Family - 97%",
                            "family 98%" = "Family - 98%",
                            "genus 95%" = "Genus - 95%",
                            "genus 96%" = "Genus - 96%",
                            "genus 97%" = "Genus - 97%",
                            "genus 98%" = "Genus - 98%",
                            "species 95%" = "Species - 95%",
                            "species 96%" = "Species - 96%",
                            "species 97%" = "Species - 97%",
                            "species 98%" = "Species - 98%")) +
  theme(axis.text.x = element_text(size = 12, angle = 45, hjust = 1,
                                   face = "bold"),
        axis.text.y = element_text(size = 12, face = "bold")) +
  labs(y = "Proportion (%) Identified") +
  ggtitle("Proportion of MOTUs identified") +
  theme(legend.position = c(0.15,0.8)) +
  theme(legend.title = element_blank(),
        legend.text = element_text(size = 20)) +
  #theme(legend.position = "bottom") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 15, face = "bold"))
p2